Global Deaths Due to Air Pollution

Elizabeth Bekele, Alison Cheek

2022-05-03

Introduction

Packages Required

#This will allow us to filter through our data 
library(tidyverse)
library(dplyr)
#This will help us plot figures to showcase our findings
library(ggplot2)
#This will help us organize and display our data as necessary 
library(knitr)
library(kableExtra)
#This expands our plot uses 
library(plotly)

Data Details

Import the deaths-due-to-air-pollution data

# Load the air-pollution death data. read.csv() already returns a data.frame,
# so wrapping the result in data.frame() is unnecessary.
deaths_df <- read.csv("death-rates-from-air-pollution.csv")

We are going to rename a few of the columns and glimpse the data

# Replace the CSV's original headers with short snake_case names.
# Names are assigned by position, so the order must match the file's columns.
names(deaths_df) <- c(
  "country",
  "acronym",
  "year",
  "total_deaths",
  "indoor_deaths",
  "outdoor_deaths",
  "ozone_deaths"
)

# Print a compact column-by-column overview of the renamed data
glimpse(deaths_df)
## Rows: 6,468
## Columns: 7
## $ country        <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanist…
## $ acronym        <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG",…
## $ year           <int> 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1…
## $ total_deaths   <dbl> 299.4773, 291.2780, 278.9631, 278.7908, 287.1629, 288.0…
## $ indoor_deaths  <dbl> 250.3629, 242.5751, 232.0439, 231.6481, 238.8372, 239.9…
## $ outdoor_deaths <dbl> 46.44659, 46.03384, 44.24377, 44.44015, 45.59433, 45.36…
## $ ozone_deaths   <dbl> 5.616442, 5.603960, 5.611822, 5.655266, 5.718922, 5.739…

Variables that interest us here include: country, total_deaths, indoor_deaths, outdoor_deaths, ozone_deaths

Data Continued

Now, let’s take a look at the population data.

# Load the population data set and inspect its columns
world_pop <- read.csv(file = "population_total_long.csv")
glimpse(world_pop)
## Rows: 12,595
## Columns: 3
## $ Country.Name <chr> "Aruba", "Afghanistan", "Angola", "Albania", "Andorra", "…
## $ Year         <int> 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 196…
## $ Count        <int> 54211, 8996973, 5454933, 1608800, 13411, 92418, 20481779,…

To get a general idea of the deaths data frame we made, let’s make a plot to see what’s happening. This is a plot of indoor vs. outdoor deaths around the world, with one point per country and year.

# Scatter plot of indoor vs outdoor deaths, one point per row of deaths_df.
# The `text` aesthetic labels each point with "<country>, <year>".
d <- ggplot(
  data = deaths_df,
  mapping = aes(
    x = indoor_deaths,
    y = outdoor_deaths,
    text = paste0(country, ", ", year)
  )
) +
  geom_point() +
  ggtitle("Outdoor Deaths vs Indoor Deaths")

# Convert the static ggplot into an interactive plotly figure
ggplotly(d)

This is a mess, and so we chose two countries from each continent (a high-population and a low-population country) to graph.

Combine Data Sets

First let’s look at a table of the high and low populated countries using the world population data set.

# Population rows for the six high-population countries, years after 1996,
# grouped by year
high_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(Year)

high_pop_countries
## # A tibble: 126 × 3
## # Groups:   Year [21]
##    Country.Name   Year     Count
##    <chr>         <int>     <int>
##  1 Australia      1997  18517000
##  2 Brazil         1997 167209040
##  3 Germany        1997  82034771
##  4 Nigeria        1997 113457663
##  5 Pakistan       1997 131057431
##  6 United States  1997 272657000
##  7 Australia      1998  18711000
##  8 Brazil         1998 169785250
##  9 Germany        1998  82047195
## 10 Nigeria        1998 116319759
## # … with 116 more rows
# Population rows for the six low-population countries, years after 1996,
# grouped by year
low_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(Year)

low_pop_countries
## # A tibble: 126 × 3
## # Groups:   Year [21]
##    Country.Name  Year    Count
##    <chr>        <int>    <int>
##  1 Canada        1997 29905948
##  2 Chile         1997 14786220
##  3 Sri Lanka     1997 18470900
##  4 Malawi        1997 10264906
##  5 New Zealand   1997  3781300
##  6 Serbia        1997  7596501
##  7 Canada        1998 30155173
##  8 Chile         1998 14977733
##  9 Sri Lanka     1998 18564599
## 10 Malawi        1998 10552338
## # … with 116 more rows

Next, we are going to see the death count for high and low populated countries using the deaths dataframe.

# Death records for the six high-population countries, years after 1996,
# grouped by year
high_pop_death <- deaths_df %>%
  filter(
    year > 1996,
    country %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(year)

# Preview the first rows
head(high_pop_death)
## # A tibble: 6 × 7
## # Groups:   year [6]
##   country   acronym  year total_deaths indoor_deaths outdoor_deaths ozone_deaths
##   <chr>     <chr>   <int>        <dbl>         <dbl>          <dbl>        <dbl>
## 1 Australia AUS      1997         22.4         0.322           21.8        0.314
## 2 Australia AUS      1998         21.5         0.284           21.0        0.305
## 3 Australia AUS      1999         20.4         0.259           19.9        0.295
## 4 Australia AUS      2000         19.4         0.240           18.9        0.290
## 5 Australia AUS      2001         18.6         0.223           18.1        0.284
## 6 Australia AUS      2002         18.1         0.211           17.7        0.286
# Death records for the six low-population countries, years after 1996,
# grouped by year
low_pop_death <- deaths_df %>%
  filter(
    year > 1996,
    country %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(year)

# Preview the first rows
head(low_pop_death)
## # A tibble: 6 × 7
## # Groups:   year [6]
##   country acronym  year total_deaths indoor_deaths outdoor_deaths ozone_deaths
##   <chr>   <chr>   <int>        <dbl>         <dbl>          <dbl>        <dbl>
## 1 Canada  CAN      1997         21.9        0.0878           19.9         2.20
## 2 Canada  CAN      1998         21.7        0.0824           19.6         2.21
## 3 Canada  CAN      1999         21.2        0.0751           19.2         2.19
## 4 Canada  CAN      2000         20.3        0.0682           18.3         2.13
## 5 Canada  CAN      2001         19.8        0.0641           17.9         2.08
## 6 Canada  CAN      2002         19.5        0.0605           17.7         2.05

Lastly, we will join the population and deaths data for each respective country and year.

# Attach population counts to the high-population death records, matching on
# country name and year. right_join() keeps every row of high_pop_countries.
joined_high <- high_pop_death %>%
  right_join(
    high_pop_countries,
    by = c("country" = "Country.Name", "year" = "Year")
  )
head(joined_high)
## # A tibble: 6 × 8
## # Groups:   year [6]
##   country   acronym  year total_deaths indoor_deaths outdoor_deaths ozone_deaths
##   <chr>     <chr>   <int>        <dbl>         <dbl>          <dbl>        <dbl>
## 1 Australia AUS      1997         22.4         0.322           21.8        0.314
## 2 Australia AUS      1998         21.5         0.284           21.0        0.305
## 3 Australia AUS      1999         20.4         0.259           19.9        0.295
## 4 Australia AUS      2000         19.4         0.240           18.9        0.290
## 5 Australia AUS      2001         18.6         0.223           18.1        0.284
## 6 Australia AUS      2002         18.1         0.211           17.7        0.286
## # … with 1 more variable: Count <int>
# Attach population counts to the low-population death records, matching on
# country name and year. right_join() keeps every row of low_pop_countries.
joined_low <- low_pop_death %>%
  right_join(
    low_pop_countries,
    by = c("country" = "Country.Name", "year" = "Year")
  )
head(joined_low)
## # A tibble: 6 × 8
## # Groups:   year [6]
##   country acronym  year total_deaths indoor_deaths outdoor_deaths ozone_deaths
##   <chr>   <chr>   <int>        <dbl>         <dbl>          <dbl>        <dbl>
## 1 Canada  CAN      1997         21.9        0.0878           19.9         2.20
## 2 Canada  CAN      1998         21.7        0.0824           19.6         2.21
## 3 Canada  CAN      1999         21.2        0.0751           19.2         2.19
## 4 Canada  CAN      2000         20.3        0.0682           18.3         2.13
## 5 Canada  CAN      2001         19.8        0.0641           17.9         2.08
## 6 Canada  CAN      2002         19.5        0.0605           17.7         2.05
## # … with 1 more variable: Count <int>

Which country has the highest death count?

Let’s make a table depicting the high- and low-population countries and their respective death counts due to pollution.

# Mean total_deaths per high-population country, averaged over every year
# present in deaths_df (no year filter is applied here).
# summarize() only needs total_deaths, so the earlier select(total_deaths) was
# dropped: on a grouped tibble it only re-added `country` and printed an
# "Adding missing grouping variables" message.
deaths_highpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_high = mean(total_deaths))
# Mean total_deaths per low-population country, averaged over every year
# present in deaths_df (no year filter is applied here).
# summarize() only needs total_deaths, so the earlier select(total_deaths) was
# dropped: on a grouped tibble it only re-added `country` and printed an
# "Adding missing grouping variables" message.
deaths_lowpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_low = mean(total_deaths))
# Render the high- and low-population averages as side-by-side tables
list(deaths_highpop_countries, deaths_lowpop_countries) %>%
  kable()
country average_death_high
Australia 17.76815
Brazil 48.42928
Germany 28.10988
Nigeria 112.30157
Pakistan 144.33463
United States 26.35827
country average_death_low
Canada 18.18542
Chile 36.51321
Malawi 147.77167
New Zealand 15.92536
Serbia 80.66558
Sri Lanka 69.60383

Here’s a graph to clearly visualize the previous table

# Horizontal bar chart of the average total deaths per high-population country
ggplot(deaths_highpop_countries, aes(x = country, y = average_death_high)) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average deaths (per 100,000)",
    title = "Average total deaths in high-population countries"
  ) +
  coord_flip()

# Horizontal bar chart of the average total deaths per low-population country
ggplot(deaths_lowpop_countries, aes(x = country, y = average_death_low)) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average deaths (per 100,000)",
    title = "Average total deaths in low-population countries"
  ) +
  coord_flip()

Which type of pollution has the greatest number of deaths?

Pollution Over Time

Which year had the worst pollution?

Which year had the worst indoor? Outdoor particulate? Outdoor ozone?

Which is worse - outdoor or indoor pollution?

First, we split the data into high and low population based on country

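Low population ≈ 0.10 × high population (each low-population country has roughly one-tenth the population of its high-population counterpart)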

# Population rows for the six high-population countries, years after 1996,
# grouped by year
high_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(Year)

high_pop_countries
## # A tibble: 126 × 3
## # Groups:   Year [21]
##    Country.Name   Year     Count
##    <chr>         <int>     <int>
##  1 Australia      1997  18517000
##  2 Brazil         1997 167209040
##  3 Germany        1997  82034771
##  4 Nigeria        1997 113457663
##  5 Pakistan       1997 131057431
##  6 United States  1997 272657000
##  7 Australia      1998  18711000
##  8 Brazil         1998 169785250
##  9 Germany        1998  82047195
## 10 Nigeria        1998 116319759
## # … with 116 more rows
# Population rows for the six low-population countries, years after 1996,
# grouped by year
low_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(Year)

low_pop_countries
## # A tibble: 126 × 3
## # Groups:   Year [21]
##    Country.Name  Year    Count
##    <chr>        <int>    <int>
##  1 Canada        1997 29905948
##  2 Chile         1997 14786220
##  3 Sri Lanka     1997 18470900
##  4 Malawi        1997 10264906
##  5 New Zealand   1997  3781300
##  6 Serbia        1997  7596501
##  7 Canada        1998 30155173
##  8 Chile         1998 14977733
##  9 Sri Lanka     1998 18564599
## 10 Malawi        1998 10552338
## # … with 116 more rows
# Mean total_deaths per high-population country, averaged over every year
# present in deaths_df. No year filter is applied here, so the average is not
# restricted to 1996-2017 as the previous comment claimed.
# summarize() only needs total_deaths, so the earlier select(total_deaths) was
# dropped: on a grouped tibble it only re-added `country` and printed an
# "Adding missing grouping variables" message.
deaths_highpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_high = mean(total_deaths))
# Mean total_deaths per low-population country (the previous comment said
# "high-population"), averaged over every year present in deaths_df; no year
# filter is applied here.
# summarize() only needs total_deaths, so the earlier select(total_deaths) was
# dropped: on a grouped tibble it only re-added `country` and printed an
# "Adding missing grouping variables" message.
deaths_lowpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_low = mean(total_deaths))
# Render the high- and low-population averages as side-by-side tables
list(deaths_highpop_countries, deaths_lowpop_countries) %>%
  kable()
country average_death_high
Australia 17.76815
Brazil 48.42928
Germany 28.10988
Nigeria 112.30157
Pakistan 144.33463
United States 26.35827
country average_death_low
Canada 18.18542
Chile 36.51321
Malawi 147.77167
New Zealand 15.92536
Serbia 80.66558
Sri Lanka 69.60383
# Horizontal bar chart of the average total deaths per high-population country
ggplot(deaths_highpop_countries, aes(x = country, y = average_death_high)) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average deaths (per 100,000)",
    title = "Average total deaths in high-population countries"
  ) +
  coord_flip()

# Horizontal bar chart of the average total deaths per low-population country
ggplot(deaths_lowpop_countries, aes(x = country, y = average_death_low)) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average deaths (per 100,000)",
    title = "Average total deaths in low-population countries"
  ) +
  coord_flip()

This shows us the deaths due to pollution, but what about the average population of those countries at that time?

# Mean population per high-population country over the years after 1996.
# summarize() only needs Count, so the earlier select(Count) was dropped: on a
# grouped tibble it only re-added `Country.Name` and printed an
# "Adding missing grouping variables" message.
hp_countries_population <- world_pop %>%
  filter(
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    ),
    Year > 1996
  ) %>%
  group_by(Country.Name) %>%
  summarize(average_population = mean(Count))
#hp_countries_population

# Mean population per low-population country over the years after 1996.
# summarize() only needs Count, so the earlier select(Count) was dropped: on a
# grouped tibble it only re-added `Country.Name` and printed an
# "Adding missing grouping variables" message.
lp_countries_population <- world_pop %>%
  filter(
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    ),
    Year > 1996
  ) %>%
  group_by(Country.Name) %>%
  summarize(average_population = mean(Count))
# Render the high- and low-population average-population tables together
list(hp_countries_population, lp_countries_population) %>%
  kable()
Country.Name average_population
Australia 21217772
Brazil 189132292
Germany 81914540
Nigeria 148549958
Pakistan 168525322
United States 300447600
Country.Name average_population
Canada 33029774
Chile 16555805
Malawi 13605376
New Zealand 4214995
Serbia 7345882
Sri Lanka 19824652
# Horizontal bar chart of average population per high-population country
ggplot(hp_countries_population, aes(x = Country.Name, y = average_population)) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average Population",
    title = "Average high-population countries"
  ) +
  coord_flip()

# Horizontal bar chart of average population per low-population country
ggplot(lp_countries_population, aes(x = Country.Name, y = average_population)) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average Population",
    title = "Average low-population countries"
  ) +
  coord_flip()

# TODO: join the data sets so we can overlay the two graphs, or build a stacked bar chart instead?

Summary